Blaine Grayson

09-23-2025

HockeyR and packages

library(hockeyR)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'ggplot2' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Loading and combining data play-by-play-data

pbp20 = load_pbp('2020-21')
pbp21 = load_pbp('2021-22')
pbp22 = load_pbp('2022-23')
pbp23 = load_pbp('2023-24')
pbp_train <- bind_rows(pbp20, pbp21, pbp22)
pbp_test <- pbp23
pbp_test <- pbp_test %>%
  dplyr::select(-secondary_type) %>%
  dplyr::rename(secondary_type = shotType)
pbp_test <- pbp_test |> 
  dplyr::mutate(
    secondary_type = dplyr::case_when(
      secondary_type == "wrist" ~ "Wrist Shot",
      secondary_type == "backhand" ~ "Backhand",
      secondary_type == "snap" ~ "Snap Shot",
      secondary_type == "slap" ~ "Slap Shot",
      secondary_type == "tip-in" ~ "Tip-In",
      secondary_type == "wrap-around" ~ "Wrap-around",
      secondary_type == "deflected" ~ "Deflected",
      secondary_type == "poke" ~ "Poke",
      secondary_type == "bat" ~ "Batted",
      secondary_type == "between-legs" ~ "Between Legs",
      secondary_type == "cradle" ~ "Cradle",
      TRUE ~ secondary_type
    )
  )

Extracting shot types for events in shots and goals

shots_train <- pbp_train |> 
  dplyr::filter(event_type %in% c("SHOT", "GOAL")) |> 
  dplyr::select(event_type,secondary_type,event_player_1_name, xg) |> 
  dplyr::mutate(Y = ifelse(event_type == "GOAL",1,0))

Getting shooting % for each shot type

shots_summary <- shots_train |> 
  dplyr::group_by(secondary_type) |> 
  dplyr::summarize(
    shots = n(),
    goals = sum(Y),
    shooting_pct = mean(Y),
    .groups = "drop"
  )

Attaching predictions back to each shot in the test set

shots_test <- pbp_test |> 
  dplyr::filter(event_type %in% c("SHOT", "GOAL")) |> 
  dplyr::select(event_type, secondary_type, event_player_1_name) |> 
  dplyr::mutate(Y = ifelse(event_type == "GOAL", 1, 0)) |> 
  left_join(shots_summary, by = "secondary_type")

Computing goals over expected (GOE) by player (model 1)

goe_model1 <- 
  shots_test |> 
  dplyr::mutate(diff = Y - shooting_pct) |> 
  dplyr::group_by(event_player_1_name) |>    
  dplyr::summarise(
    player = unique(event_player_1_name),
    GOE = round(sum(diff),3),
    shots = dplyr::n(),
    goals = sum(Y)
  ) |> 
  dplyr::select(player, shots, goals, GOE) |> 
  dplyr::arrange(dplyr::desc(GOE))

goe_model1

Calculating Log Loss for Model 1 and NHL xG

logloss <- function(y, phat){
  keep <- !is.na(y) & !is.na(phat)
  y <- y[keep]
  phat <- phat[keep]
  if(any(phat < 1e-12)) phat[phat < 1e-12] <- 1e-12
  if(any(phat > 1 - 1e-12)) phat[phat > 1 - 1e-12] <- 1 - 1e-12
  return(-1 * mean(y * log(phat) + (1 - y) * log(1 - phat)))
}

cat("Shot Type Log Loss:",
    round(logloss(shots_test$Y, shots_test$shooting_pct), 3), "\n")
## Shot Type Log Loss: 0.329
cat("NHL xG Log Loss:",
    round(logloss(shots_train$Y, shots_train$xg), 3), "\n")
## NHL xG Log Loss: 0.269

Model 2

Preparing data for model 2

pbp_model2 = pbp_train |> 
  select(xg, season, description, event_type, event, secondary_type,shot_distance, shot_angle, event_player_1_name, event_player_1_id) |> 
  filter(event_type %in% c("SHOT", "GOAL")) |> 
  mutate(Y = ifelse(event_type == "GOAL", 1, 0))
pbp_model2

Fitting a linear regression model that will predict on distance, angle, and shot type

model2 = glm(formula = Y~shot_distance + shot_angle + secondary_type, data = pbp_model2, family = binomial("logit"))
model2
## 
## Call:  glm(formula = Y ~ shot_distance + shot_angle + secondary_type, 
##     family = binomial("logit"), data = pbp_model2)
## 
## Coefficients:
##                (Intercept)               shot_distance  
##                   -0.97348                    -0.03798  
##                 shot_angle        secondary_typeBatted  
##                   -0.01317                     0.72395  
## secondary_typeBetween Legs        secondary_typeCradle  
##                    0.03674                     1.24339  
##    secondary_typeDeflected  secondary_typePenalty Shot  
##                    0.24263                     1.12735  
##         secondary_typePoke     secondary_typeSlap Shot  
##                    0.02471                     0.48893  
##    secondary_typeSnap Shot        secondary_typeTip-In  
##                    0.50279                     0.17230  
##  secondary_typeWrap-around    secondary_typeWrist Shot  
##                   -0.55114                     0.22909  
## 
## Degrees of Freedom: 234252 Total (i.e. Null);  234239 Residual
##   (29 observations deleted due to missingness)
## Null Deviance:       152500 
## Residual Deviance: 142800    AIC: 142800

Adding Predicted xG from model2

shot_test2 <- pbp_test |> 
  dplyr::filter(event_type %in% c("SHOT", "GOAL")) |> 
  dplyr::select(description, event_type, event, secondary_type,shot_distance, shot_angle, event_player_1_name, event_player_1_id) |> 
  dplyr::mutate(Y = ifelse(event_type == "GOAL", 1, 0)) |> 
  left_join(shots_summary, by = "secondary_type")
shot_test2 = shot_test2 |> mutate(xg2 = predict(model2, newdata = shot_test2, type="response")) |> drop_na(xg2)

Computing GOE per player based on model 2

shot_test2 = shot_test2 |> mutate(GOE_ON_PLAY = Y - xg2)

GOE_Leaders <- shot_test2 |> 
  group_by(event_player_1_name) |> 
  summarise(
    GOE = sum(GOE_ON_PLAY),
    shots = n(),
    goals = sum(Y),
    .groups = "drop"
  ) |> 
  mutate(GOE = round(GOE, 3)) |>
  select(player = event_player_1_name, goals, shots, GOE) |> 
  arrange(desc(GOE))
GOE_Leaders

Log Loss for Model 2

cat("Model 2 Log Loss:",
    round(logloss(shot_test2$Y, shot_test2$xg2), digits=3),"\n")
## Model 2 Log Loss: 0.309
cat("NHL xG Log Loss:",
    round(logloss(shots_train$Y, shots_train$xg), 3), "\n")
## NHL xG Log Loss: 0.269

Model 3

Preparing data for model 3

pbp_filtered = pbp_train |> 
  select(xg, season, description, event_type, event, secondary_type, event_goalie_name, event_goalie_id, strength_code, strength, shot_distance, shot_angle, event_player_1_name, event_player_1_id) |> 
  filter(event_type %in% c("SHOT", "GOAL"))
pbp_filtered

Adding a column that tracks a goalies save percentage for that year

goalies_train = pbp_filtered |> group_by(event_goalie_name, season) |> 
  summarize(sv_pct = sum(event_type == "SHOT") / n())
## `summarise()` has grouped output by 'event_goalie_name'. You can override using
## the `.groups` argument.
pbp_final_train = left_join(pbp_filtered, goalies_train, by = c("event_goalie_name", "season"))
pbp_final_train = pbp_final_train |> mutate(Y = ifelse(event_type == "GOAL", 1, 0))
pbp_final_train

Fitting logistic regression model3

model3 = glm(formula = Y~shot_distance + shot_angle + secondary_type + sv_pct + strength, data = pbp_final_train, family = binomial("logit"))
model3
## 
## Call:  glm(formula = Y ~ shot_distance + shot_angle + secondary_type + 
##     sv_pct + strength, family = binomial("logit"), data = pbp_final_train)
## 
## Coefficients:
##                (Intercept)               shot_distance  
##                   11.11790                    -0.05388  
##                 shot_angle        secondary_typeBatted  
##                   -0.01233                     0.70745  
## secondary_typeBetween Legs        secondary_typeCradle  
##                   -0.09129                     1.31750  
##    secondary_typeDeflected  secondary_typePenalty Shot  
##                    0.32361                     1.17451  
##         secondary_typePoke     secondary_typeSlap Shot  
##                   -0.15978                     0.94691  
##    secondary_typeSnap Shot        secondary_typeTip-In  
##                    0.75162                     0.21205  
##  secondary_typeWrap-around    secondary_typeWrist Shot  
##                   -0.59727                     0.36483  
##                     sv_pct          strengthPower Play  
##                  -13.33082                     0.33313  
##        strengthShorthanded  
##                    0.19792  
## 
## Degrees of Freedom: 233568 Total (i.e. Null);  233552 Residual
##   (713 observations deleted due to missingness)
## Null Deviance:       151000 
## Residual Deviance: 130500    AIC: 130500

Preparing test data for model3

pbp_filtered_test <- pbp_test |> 
  dplyr::select(description, event_type, event, secondary_type,
                event_goalie_name, event_goalie_id, strength_code, strength,
                shot_distance, shot_angle, event_player_1_name, event_player_1_id) |> 
  dplyr::filter(event_type %in% c("SHOT", "GOAL")) |> 
  dplyr::mutate(Y = ifelse(event_type == "GOAL", 1, 0))
pbp_filtered_test

calculating goalie save percentage for test set

goalies_test = pbp_filtered_test |> group_by(event_goalie_name) |> 
  summarize(sv_pct = sum(event_type == "SHOT") / n())
pbp_final_test = left_join(pbp_filtered_test, goalies_test, by = c("event_goalie_name"))
pbp_final_test = pbp_final_test |> mutate(Y = ifelse(event_type == "GOAL", 1, 0))
pbp_final_test

Adding predicted xG from model 3 and Computing GOE per player

pbp_final_test = pbp_final_test |> mutate(xg3 = predict(model3, newdata = pbp_final_test, type="response")) |> drop_na(xg3)
pbp_final_test = pbp_final_test |> mutate(GOE_ON_PLAY = Y - xg3)

GOE_Leaders_final <- pbp_final_test |> 
  group_by(event_player_1_name) |> 
  summarise(
    GOE = sum(GOE_ON_PLAY),
    shots = n(),
    goals = sum(Y),
    .groups = "drop"
  ) |> 
  mutate(GOE = round(GOE, 3)) |>
  select(player = event_player_1_name, goals, shots, GOE) |> 
  arrange(desc(GOE))
GOE_Leaders_final

Log Loss model 3

cat("Model 3 Log Loss:",
    round(logloss(pbp_final_test$Y, pbp_final_test$xg3), digits=3),"\n")
## Model 3 Log Loss: 0.269
cat("NHL xG Log Loss:",
    round(logloss(shots_train$Y, shots_train$xg), 3), "\n")
## NHL xG Log Loss: 0.269

Forest Data

shot_vars = 
  c("Y", "secondary_type", "strength", "shot_distance", "shot_angle", "sv_pct")

train_data_tree <- pbp_final_train |> 
  dplyr::select(all_of(shot_vars))

y_train = train_data_tree$Y

model_3_tree = ranger::ranger(formula = Y~., data = train_data_tree, probability = TRUE)
## Growing trees.. Progress: 85%. Estimated remaining time: 5 seconds.
pbp_test_tree <- pbp_final_test |> 
  mutate(xg_model3 = predict(object = model_3_tree, data = pbp_final_test)$predictions[,2])
goe_model3 = pbp_test_tree |> 
  dplyr::mutate(diff = Y - xg_model3) |> 
  dplyr::group_by(event_player_1_name) |> 
  dplyr::summarise(
    GOE = round(sum(diff),3),
    shots = n(),
    goals = sum(Y)
  ) |> 
  dplyr::arrange(desc(GOE))

goe_model3

Player Testing

goe_model3 |> filter(event_player_1_name == "Matthew Tkachuk")
cat("Model 3 Log Loss:",
    round(logloss(pbp_test_tree$Y, pbp_test_tree$xg_model3), digits=3),"\n")
## Model 3 Log Loss: 0.279